*0_addtl_data_prep_june2022*
*created: 12-9-18
*updated: 3-9-20 by Katelyn Heath, 6-21-22 by Meredith Welch 
*authors: Sebastian Montenegro, Katelyn Heath, Scott Imberman, Meredith Welch 
*This code does some additional data prep needed to run the 1_prelim_results_earnings do file.

/** SECTION 0: SET UP **/ 

* Project directory globals. outdir and root are set to the same location.
global outdir "/srv/tier1/projects/180_major/Majors"
global root "/srv/tier1/projects/180_major/Majors"
global data "$root/1_data_cleaning/data"
global output_desc "$root/2_data_analysis/desc_stats"
global output_results "$root/2_data_analysis/results"
global tables "$output_results/tables"
global plots "$output_results/plots"

* Close any log that is still open (no error if none is), then start this run's log.
capture log close
log using "$output_results/0_addtl_data_prep_june2022.log", replace

* Session settings, including fixed random-number and sort seeds.
clear all
set more off
set emptycells drop
set matsize 11000
set seed 131524
set sortseed 54651
/** SECTION 0 (CONTINUED): SET UP CONTROL MACROS **/ 

*COMBINED SECTOR MAJORS MACRO
local major "ag com it voc eng arch bio sci soc bus und"

*CONTROL VARIABLES from Andrews, Li, Lovenheim (2014) Table 2, as a space-separated and a comma-separated list
local controls "z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad"
local controls_comma "z_math, z_reading, rank_90_math, rank_7090_math, rank_90_reading, rank_7090_reading, male, white, hispanic, black, asian, gift, atrisk, econ_disad"

*DIFFERENT SECTORS MAJORS MACROS
* The per-sector lists are defined first; majors is the 4yr list followed by the 2yr list.
local majors_4yr "ag_4yr com_4yr it_4yr voc_4yr eng_4yr arch_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr"
local majors_2yr "ag_2yr com_2yr it_2yr voc_2yr eng_2yr arch_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr"
local majors "`majors_4yr' `majors_2yr'"

* all_2yr and all_4yr = sector major dummies followed by a common covariate/earnings-date/cohort tail.
* Note that the all_2yr major list does not contain educ_2yr, unlike majors_2yr.
local all_tail "z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading white hispanic black asian gift atrisk econ_disad first_earn_year first_earn_qtr last_earn_year last_earn_qtr cohort.1996 cohort.1997 cohort.1998 cohort.1999 cohort.2000 cohort.2001 cohort.2002 cohort.2003 cohort.2004 cohort.2005"
local all_2yr "ag_2yr com_2yr it_2yr voc_2yr eng_2yr arch_2yr bio_2yr sci_2yr soc_2yr bus_2yr und_2yr `all_tail'"
local all_4yr "ag_4yr com_4yr it_4yr voc_4yr eng_4yr arch_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr `all_tail'"

/** SECTION 1: CREATE SAMPLES **/ 

*Open data
use "$data/tea_thecb_oneobs_twc_panel_cleaned.dta", clear

**FLAG DEGREE ATTAINMENT AT THE PERSON LEVEL
* Each *_temp flag is 1 on a row when any of the three degree-level slots holds the relevant code;
* the max within ssnrep then carries the flag to every row of that person.
* Codes as used here: 3 or 4 -> grad_degree, 2 -> ba_degree, 1 -> aa_degree.
gen grad_temp = inlist(deg_level1, 3, 4) | inlist(deg_level2, 3, 4) | inlist(deg_level3, 3, 4)
egen grad_degree = max(grad_temp), by(ssnrep)

gen ba_temp = inlist(2, deg_level1, deg_level2, deg_level3)
egen ba_degree = max(ba_temp), by(ssnrep)

gen aa_temp = inlist(1, deg_level1, deg_level2, deg_level3)
egen aa_degree = max(aa_temp), by(ssnrep)

*NOTE INDIVIDUALS WITH A PUBLIC ADVANCED DEGREE BUT NO BA (BECAUSE THEY GOT THEIR BA OUT OF STATE OR PRIVATE) ARE DROPPED
drop if grad_degree == 1 & ba_degree == 0
	
*Need indicators for each individual ever being in 2 yr sector and ever being in 4 yr sector

	*2-yr sector = 1 when any of the following holds:
	*  (a) has an AA degree but no BA degree
	*  (b) no AA and no BA degree, has a 2-yr major year but no 4-yr major year
	*  (c) no AA and no BA degree, has both, and the last 2-yr major year is strictly later than the last 4-yr one
	gen sector_2yr = (aa_degree == 1 & ba_degree == 0) ///
		| (aa_degree == 0 & ba_degree == 0 & lyear_2yr_major1 != . & lyear_4yr_major1 == .) ///
		| (aa_degree == 0 & ba_degree == 0 & lyear_2yr_major1 != . & lyear_4yr_major1 != . & lyear_2yr_major1 > lyear_4yr_major1)

	*4-yr sector = 1 when any of the following holds:
	*  (a) has a BA degree, regardless of AA degree status
	*  (b) no BA degree, has a 4-yr major year but no 2-yr major year
	*  (c) no AA and no BA degree, has both, and the last 4-yr major year is the same or later (ties go to 4-yr)
	gen sector_4yr = (ba_degree == 1) ///
		| (ba_degree == 0 & lyear_4yr_major1 != . & lyear_2yr_major1 == .) ///
		| (aa_degree == 0 & ba_degree == 0 & lyear_4yr_major1 != . & lyear_2yr_major1 != . & lyear_4yr_major1 >= lyear_2yr_major1)

	*A VERY SMALL NUMBER HAVE AN ASSOCIATE DEGREE AND ENROLL IN A 4YEAR BUT HAVE NO MAJOR ATTACHED - WILL DROP THESE
	* As coded: AA holders with a 4-yr major year and no 2-yr major year are dropped.
	* NOTE(review): the condition does not test ba_degree, so BA holders matching this pattern are dropped too - confirm intended.
	drop if aa_degree == 1 & lyear_2yr_major1 == . & lyear_4yr_major1 != .

*RESTRICT TO ENROLLED STUDENTS (assigned to at least one sector)
keep if inlist(1, sector_4yr, sector_2yr)

/** SECTION 2: CREATE MAJOR VARIABLES **/ 
*Fix some major codes - 4 digit and 2 digit codes do not match for some aggregate groups (business, lib, soc)
*for second major variable. Some econ/history in wrong category.  

* Generate new 2-digit cip code for second majors: first two characters of the 4-digit string code,
* converted to numeric under the original variable names (the old 2-digit variables are replaced)
foreach s in 4yr 2yr {
	gen sec_maj_2_`s'_str = substr(sec_maj_4_`s', 1, 2)
}
drop sec_maj_2_4yr sec_maj_2_2yr
foreach s in 4yr 2yr {
	destring sec_maj_2_`s'_str, gen(sec_maj_2_`s')
}

* Recode these 2 digit cip codes according to 1_data_cleaning_july2021.do
* The same ordered sequence of recodes is applied to each sector's second-major code.
foreach s in 2yr 4yr {
	local v sec_maj_2_`s'

	* Marketing to Business
	replace `v' = 52 if `v' == 8
	* Cultural and Gender Studies to Liberal Arts
	replace `v' = 24 if `v' == 5
	* Combine Ag and Natural Resources
	replace `v' = 1 if inlist(`v', 3, 2)
	* Combine Computer and Tech Support
	replace `v' = 10 if `v' == 11
	* Make Liberal Arts General Category
	replace `v' = 24 if inlist(`v', 16, 23, 25, 30, 38, 39, 50, 54)
	* Make Social Science General Category
	replace `v' = 45 if inlist(`v', 42, 44, 19, 22)
	* Make Vocation Category
	replace `v' = 12 if inlist(`v', 15, 28, 29, 31, 32, 33, 34, 35, 36, 37, 41, 43, 46, 47, 48, 49, 20)
	* Make Engineering + Architecture: disabled (code 4 is NOT folded into 14)
	* Make Physical Sciences + Math
	replace `v' = 40 if `v' == 27
	* Make Health Category
	replace `v' = 26 if inlist(`v', 51, 60)
	* Move History (4-digit 4508) to Liberal Arts
	replace `v' = 24 if sec_maj_4_`s' == "4508"
	* Move Econ (4-digit 4506) to Business
	replace `v' = 52 if sec_maj_4_`s' == "4506"
	* Drop Technology Education (removes the observations, not just the code)
	drop if `v' == 21
}

* Fix first major variable for econ and history
foreach s in 4yr 2yr {
	replace maj_2_`s' = 24 if maj_4_`s' == "4508"
}
foreach s in 4yr 2yr {
	replace maj_2_`s' = 52 if maj_4_`s' == "4506"
}

*Create Major/Sector interaction terms

* 2-yr sector: each dummy is 1 when either the first or the second 2-yr major carries the group's 2-digit code.
gen byte ag_2yr = (maj_2_2yr == 1 | sec_maj_2_2yr == 1)
label var ag_2yr "Agriculture 2yr Sector"

gen byte com_2yr = (maj_2_2yr == 9 | sec_maj_2_2yr == 9)
label var com_2yr "Communications 2yr Sector"

gen byte it_2yr = (maj_2_2yr == 10 | sec_maj_2_2yr == 10)
label var it_2yr "IT 2yr Sector"

gen byte voc_2yr = (maj_2_2yr == 12 | sec_maj_2_2yr == 12)
label var voc_2yr "Vocational 2yr Sector"

gen byte eng_2yr = (maj_2_2yr == 14 | sec_maj_2_2yr == 14)
label var eng_2yr "Engineering 2yr Sector"

gen byte arch_2yr = (maj_2_2yr == 4 | sec_maj_2_2yr == 4)
label var arch_2yr "Architecture 2yr Sector"

gen byte lib_2yr = (maj_2_2yr == 24 | sec_maj_2_2yr == 24)
label var lib_2yr "Liberal Arts 2yr Sector"

gen byte bio_2yr = (maj_2_2yr == 26 | sec_maj_2_2yr == 26)
label var bio_2yr "Biology + Health 2yr Sector"

gen byte sci_2yr = (maj_2_2yr == 40 | sec_maj_2_2yr == 40)
label var sci_2yr "Physical Sciences + Math 2yr Sector"

gen byte soc_2yr = (maj_2_2yr == 45 | sec_maj_2_2yr == 45)
label var soc_2yr "Social Sciences 2yr Sector"

gen byte bus_2yr = (maj_2_2yr == 52 | sec_maj_2_2yr == 52)
label var bus_2yr "Business + Economics 2yr Sector"

gen byte educ_2yr = (maj_2_2yr == 13 | sec_maj_2_2yr == 13)
label var educ_2yr "Education 2yr Sector"

* Undeclared: code 99 in either slot, or both slots missing (unlikely to happen, but handled anyway)
gen byte und_2yr = (maj_2_2yr == 99 | sec_maj_2_2yr == 99 | (maj_2_2yr == . & sec_maj_2_2yr == .))
label var und_2yr "Undeclared 2yr Sector"


* 4-yr sector major dummies: each is 1 when either the first or the second 4-yr major carries the group's 2-digit code.
* The dummies are created in a fixed order (ag ... und) because later code walks the range ag_4yr-und_4yr.
gen byte ag_4yr = (maj_2_4yr == 1 | sec_maj_2_4yr == 1)
label var ag_4yr "Agriculture 4yr Sector"

gen byte com_4yr = (maj_2_4yr == 9 | sec_maj_2_4yr == 9)
label var com_4yr "Communications 4yr Sector"

gen byte it_4yr = (maj_2_4yr == 10 | sec_maj_2_4yr == 10)
label var it_4yr "IT 4yr Sector"

gen byte voc_4yr = (maj_2_4yr == 12 | sec_maj_2_4yr == 12)
label var voc_4yr "Vocational 4yr Sector"

gen byte eng_4yr = (maj_2_4yr == 14 | sec_maj_2_4yr == 14)
label var eng_4yr "Engineering 4yr Sector"

* The label goes on arch_4yr; labelling arch_2yr here would overwrite the 2yr label and leave arch_4yr unlabeled.
gen byte arch_4yr = (maj_2_4yr == 4 | sec_maj_2_4yr == 4)
label var arch_4yr "Architecture 4yr Sector"

gen byte lib_4yr = (maj_2_4yr == 24 | sec_maj_2_4yr == 24)
label var lib_4yr "Liberal Arts 4yr Sector"

gen byte bio_4yr = (maj_2_4yr == 26 | sec_maj_2_4yr == 26)
label var bio_4yr "Biology + Health 4yr Sector"

gen byte sci_4yr = (maj_2_4yr == 40 | sec_maj_2_4yr == 40)
label var sci_4yr "Physical Sciences + Math 4yr Sector"

gen byte soc_4yr = (maj_2_4yr == 45 | sec_maj_2_4yr == 45)
label var soc_4yr "Social Sciences 4yr Sector"

gen byte bus_4yr = (maj_2_4yr == 52 | sec_maj_2_4yr == 52)
label var bus_4yr "Business + Economics 4yr Sector"

* Undeclared: code 99 in either slot, or both slots missing
gen byte und_4yr = (maj_2_4yr == 99 | sec_maj_2_4yr == 99 | (maj_2_4yr == . & sec_maj_2_4yr == .))
label var und_4yr "Undeclared 4yr Sector"


*CREATE A SINGLE CONSTRUCTED MAJORS VARIABLE FOR EACH SECTOR TO USE IN MAJOR-SPECIFIC SEASONALITY ADJUSTMENT (SAI 8-6-23)
* For each sector, walk the dummies from ag_* to und_* in dataset order and store the position of the dummy
* that equals 1. When several dummies are 1 for a row, the one latest in the range wins.
foreach s in 4yr 2yr {
	gen majors_constructed_`s' = .
	local pos = 0
	foreach dummy of varlist ag_`s'-und_`s' {
		local ++pos
		replace majors_constructed_`s' = `pos' if `dummy' == 1
	}
}

compress

*** For part 1, need to generate which_maj variable to use in boxplots. This tells us why the person is in the aggregate major group (from first or second major)
* which_maj_<group>_<sector> = 1 when the first major has the group's code, overwritten with 2 when the
* second major has it (so 2 wins when both match), and missing otherwise.
* stubs and codes are parallel lists; the 2yr sector additionally has the education group (code 13).
foreach sector in 4yr 2yr {
	local stubs "ag com it voc eng arch lib bio sci soc bus"
	local codes "1 9 10 12 14 4 24 26 40 45 52"
	if "`sector'" == "2yr" {
		local stubs "`stubs' educ"
		local codes "`codes' 13"
	}

	local n_groups : word count `stubs'
	forvalues g = 1/`n_groups' {
		local stub : word `g' of `stubs'
		local code : word `g' of `codes'
		gen which_maj_`stub'_`sector' = 1 if maj_2_`sector' == `code'
		replace which_maj_`stub'_`sector' = 2 if sec_maj_2_`sector' == `code'
	}

	* Undeclared: first major is 99 or missing.
	*SAY THAT TO BE UNDECLARED IN 2ND MAJOR, YOU MUST ALSO BE UNDECLARED IN 1ST
	gen which_maj_und_`sector' = 1 if maj_2_`sector' == 99 | maj_2_`sector' == .
	replace which_maj_und_`sector' = 2 if (sec_maj_2_`sector' == 99 | sec_maj_2_`sector' == .) & (maj_2_`sector' == 99 | maj_2_`sector' == .)
}



/** SECTION 3: CREATE VARIABLES FOR SELECTION INTO EARNINGS SAMPLE **/ 

/** MW NEW SECTION TO INCORPORATE VARIABLES NEEDED FOR HETEROGENEITY AND ZERO EARNINGS ANALYSES **/ 

***For part 2, need to look at 0-earnings spells in the panel data 

* Create variables for number of quarters in earnings data, number of quarters with non-zero earnings, number of quarters with zero earnings, and number of quarters with zero earnings where not enrolled in college
* Note: wage_adj is recoded to 0 when simultaneously enrolled in college as indicated by col_earn_ind==1
* Quarter-level flags and their per-person totals (one total repeated on every row of a person).
sort ssnrep qtr_since_hs
gen twc_sample = wage !=. 
by ssnrep: egen qtrs_in_twc = total(twc_sample)

* Stata orders missing values above every number, so wage_adj>0 on its own is TRUE when wage_adj is missing.
* The explicit non-missing check keeps quarters without earnings data out of the nonzero-earnings count.
gen qtr_nonzero_earn = wage_adj>0 & !missing(wage_adj)
gen qtr_zero_earn = wage_adj==0 
gen qtr_zero_earn_noenroll = wage_adj==0 & col_earn_ind==0

by ssnrep: egen qtrs_nonzero_earn = total(qtr_nonzero_earn)  
by ssnrep: egen qtrs_zero_earn = total(qtr_zero_earn) 
by ssnrep: egen qtrs_zero_earn_noenroll = total(qtr_zero_earn_noenroll)

label variable twc_sample "In TWC sample (wage not missing)"
label variable qtrs_in_twc "Number of quarters with non-missing earnings (wage)"
label variable qtr_nonzero_earn "Quarter has nonzero earnings (wage_adj)"
label variable qtr_zero_earn "Quarter has zero earnings (wage_adj)"
label variable qtr_zero_earn_noenroll "Quarter has zero earnings and not enrolled (wage_adj)"
label variable qtrs_nonzero_earn "Number of quarters with non-zero earnings (wage_adj)"
label variable qtrs_zero_earn "Number of quarters with zero earnings (wage_adj)"
label variable qtrs_zero_earn_noenroll "Number of quarters with zero earnings, excluding those where enrolled in college"
* qtr_individual is not created in this file; it is expected to already be in the loaded data.
label variable qtr_individual "Total number of quarters in data"

* Create variable for ever drop out of earnings (excluding quarters where enrolled in college)
* 1 when the person has at least one zero-earnings quarter with col_earn_ind==0
gen twc_drop = qtrs_zero_earn_noenroll > 0  
/*preserve 
collapse (mean) twc_drop qtr_individual qtrs_in_twc qtrs_nonzero_earn qtrs_zero_earn qtrs_zero_earn_noenroll, by(ssnrep)
tab twc_drop 
sum qtrs_in_twc qtrs_nonzero_earn
tab qtrs_zero_earn_noenroll
tab qtrs_nonzero_earn 
restore */ 

* Check that those with only 0 earnings are enrolled in those quarters.
*tab col_earn_ind if qtrs_nonzero_earn==0, m 
*br ssnrep cohort year qtr wage wage_adj col_earn_ind time first_earn_year first_earn_qtr last_earn_year last_earn_qtr qtr_since_hs insample_wage qtr_cohort qtr_individual twc_sample qtr_nonzero_earn qtr_zero_earn qtr_zero_earn_noenroll if qtrs_nonzero_earn==0



* Save the prepared panel under a new file name; this file is reloaded by the steps that follow.
save "$data/tea_thecb_oneobs_twc_panel_cleaned_new.dta", replace 


  ***SAI 8-3-2023 --- CREATE AN ANNUAL WAGE AND DERIVE COEFFICIENT OF VARIATION BASED ON THAT TO ADDRESS R1 POINT 3 COMMENT ON SEASONALITY
  * The work is done in a separate frame (annual); the result is saved to disk and the default frame is restored at the end.
  frame create annual
  frame change annual
  use "$data/tea_thecb_oneobs_twc_panel_cleaned_new.dta", clear
  gen year_since_hs = int(qtr_since_hs/4)
  keep ssnrep wage_adj insample_wage year_since_hs
  
  *REMOVE YEARS WITH ONLY PARTIAL WAGE DATA (NOTE - ZEROS ARE INCLUDED IF IN BETWEEN FIRST AND LAST OBSERVED WAGE IN TX)
  * duplicates tag stores the number of OTHER rows sharing ssnrep/year_since_hs, so dup == 3 keeps years with four rows.
  duplicates tag ssnrep year_since_hs, gen(dup)
  keep if dup == 3
  drop dup
  
  *COLLAPSE TO ANNUAL DATA
  * wage_adj becomes the mean over the year's rows; insample_wage becomes the minimum over the year's rows.
  collapse (mean) wage_adj (min) insample_wage, by(ssnrep year_since_hs)
  
	***RESTRICT TO 5 YEARS (20 QUARTERS) AFTER HS --> 
	* The restriction is applied BEFORE any ingredient of the person-specific linear trend is computed, so that
	* the observation count, the variance of time, the means and the cross-product all use the same set of years.
	* (Computing the count and the time variance on the unrestricted years while the means and cross-product use
	* the restricted years makes beta_wage = covar/var mix two different samples.)
	* NOTE(review): the condition keeps year_since_hs of 6 and above - confirm that excluding year 5 is intended.
	keep if year_since_hs > 5

	* Number of in-sample annual observations per person (divisor for the covariance below)
	gegen numwageobs=sum(insample_wage), by(ssnrep)

	*GENERATE VARIANCE OF TIME
	gegen temp=sd(year_since_hs) if insample_wage==1, by(ssnrep)
	gen vart_wage=temp^2
	drop temp

	* Person-level means of time and wage, then the sample covariance of wage with time
	gegen tbar_wage=mean(year_since_hs) if insample_wage==1, by(ssnrep)
	gegen wbar_wage=mean(wage_adj) if insample_wage==1, by(ssnrep)
	gen temp=(wage_adj-wbar_wage)*(year_since_hs-tbar_wage) if insample_wage==1
	gegen temp2=sum(temp) if insample_wage==1, by(ssnrep)
	gen covar_wage=temp2/(numwageobs-1) if insample_wage==1
	drop temp temp2

	* Slope and intercept of the person-specific linear trend of annual wage on years since HS
	gen beta_wage=covar_wage/vart_wage if insample_wage==1
	gen alpha_wage=wbar_wage-beta_wage*tbar_wage if insample_wage==1

	* Fitted trend value and a flag for negative fitted values (flag is missing when the fit is missing)
	gen wagehat_wage = alpha_wage+beta_wage*year_since_hs
	gen wagehat_wage_negative = wagehat_wage<0 if wagehat_wage != .


	*DEV OF ANNUAL WAGE RELATIVE TO TREND
	* Only defined for in-sample years whose fitted value is positive (a missing fitted value yields a missing deviation)
	gen dev_wagehat_annual = wage_adj-wagehat_wage if insample_wage==1 & wagehat_wage > 0
	label variable dev_wagehat_annual "DEV = wage_adj-wagehat_wage; avg annual wages over year_since_hs"
	
	
	*COLLAPSE TO SSNREP SPECIFIC DATASET
	* One row per person: mean annual wage, sd of the deviations from trend, and their ratio (coefficient of variation)
	collapse (mean) wage_adj_annual = wage_adj (sd) sd_wagehat_annual = dev_wagehat_annual, by(ssnrep)
	gen coeff_var_wagehat_annual = sd_wagehat_annual/wage_adj_annual
	label variable coeff_var_wagehat_annual "CV = sd(wage_adj-wagehat_wage)/wage_adj"
	
	save  "$data/coeff_var_annual.dta", replace 
	frame change default
	
	
/** SECTION 4: CREATE SMALLER DATASETS
	       1. FULL PANEL WITH FEWER VARS 
	       2. COLLAPSED TO ONE OBS PER INDIVIDUAL - USED FOR ANALYSIS **/
	       
	       
use "$data/tea_thecb_oneobs_twc_panel_cleaned_new.dta", clear

* Variable groups shared by the keep and collapse statements of the 4yr branch
local covars_4yr "z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad first_earn_year first_earn_qtr last_earn_year last_earn_qtr cohort"
local dummies_4yr "lib_4yr ag_4yr com_4yr it_4yr voc_4yr eng_4yr arch_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr"
local wagevars_4yr "wage_5to10* wage_10to15* wage_15to20* insample* alpha* beta* wbar* wagehat* col* campus first_col* last_col*"
local spells_4yr "qtrs_in_twc qtrs_nonzero_earn qtrs_zero_earn qtrs_zero_earn_noenroll qtr_individual twc_drop"

*4yr
* The full panel is preserved here so the 2yr branch can restore it afterwards.
preserve
keep if sector_4yr == 1

* 1. FULL PANEL WITH FEWER VARS
keep ssnrep `covars_4yr' `dummies_4yr' maj_* sec_maj_* which_maj_* ///
	qtr_since_hs firstq_hs lastq_hs wage wage_adj* `wagevars_4yr' deg_level* ///
	twc_sample qtr_nonzero_earn qtr_zero_earn qtr_zero_earn_noenroll `spells_4yr' dev_*
save "$data/full_data_4yr.dta", replace

	***COLLAPSE TO ONE OBS PER INDIVIDUAL FOR ANALYSES OF GROWTH RATES (BETA_HS) AND MEAN EARNINGS (WBAR)
	* Means for covariates/dummies/wage summaries, first non-missing major codes, max degree level,
	* and the within-person sd of every dev_* deviation series.
	collapse (mean) `covars_4yr' `dummies_4yr' which_maj_* firstq_hs lastq_hs `wagevars_4yr' `spells_4yr' wage_adj* ///
		(firstnm) maj_* sec_maj_* ///
		(max) deg_level* ///
		(sd) dev_*, by(ssnrep)

	* Shorten the moving-average deviation names (full and shortened sample) to the dev_ma_* pattern used below
	foreach window in 2 4 8 {
		rename dev_wage_ma_`window' dev_ma_`window'
	}
	foreach window in 2 4 8 {
		rename dev_wage_ma_`window'_short dev_ma_`window'_short
	}

	*RENAME DEVIATION TO STANDARD DEVIATION AND CREATE COEFFICIENT OF VARIATION
	* After the collapse each dev_* holds an sd; CV = that sd divided by the matching mean wage_adj_* variable.
	foreach predicted in wagehat_trend ma_4 ma_8 {
		rename dev_`predicted' sd_`predicted'
		label variable sd_`predicted' "sd(wage_adj - `predicted')"
		gen coeff_var_`predicted' = sd_`predicted'/wage_adj_`predicted'
		label variable coeff_var_`predicted'  "CV = sd(wage_adj - `predicted')/wage_adj"

		rename dev_`predicted'_short sd_`predicted'_short
		gen coeff_var_`predicted'_short = sd_`predicted'_short/wage_adj_`predicted'_short
		label variable coeff_var_`predicted'_short  "CV = sd(wage_adj - `predicted')/wage_adj - shortened sample"
	}

	* Attach the annual-wage CV; keep every person in memory (matched or not), drop using-only rows
	merge 1:1 ssnrep using "$data/coeff_var_annual.dta", keep(master match) nogenerate

	save "$data/collapsed_data_4yr.dta", replace
 
 
 *2yr
 * Bring back the full panel that was preserved before the 4yr branch, then keep the 2yr sector.
 restore
 keep if sector_2yr == 1

 * Variable groups shared by the keep and collapse statements of the 2yr branch
 local covars_2yr "z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad first_earn_year first_earn_qtr last_earn_year last_earn_qtr cohort"
 local dummies_2yr "lib_2yr ag_2yr com_2yr it_2yr voc_2yr eng_2yr arch_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr"
 local wagevars_2yr "wage_5to10* wage_10to15* wage_15to20* insample* alpha* beta* wbar* wagehat* col* campus first_col* last_col*"
 local spells_2yr "qtrs_in_twc qtrs_nonzero_earn qtrs_zero_earn qtrs_zero_earn_noenroll qtr_individual twc_drop"

 * 1. FULL PANEL WITH FEWER VARS
 keep ssnrep `covars_2yr' `dummies_2yr' maj_* sec_maj_* which_maj_* ///
	qtr_since_hs firstq_hs lastq_hs wage wage_adj* `wagevars_2yr' deg_level* ///
	twc_sample qtr_nonzero_earn qtr_zero_earn qtr_zero_earn_noenroll `spells_2yr' dev_*
 save "$data/full_data_2yr.dta", replace

	***COLLAPSE TO ONE OBS PER INDIVIDUAL FOR ANALYSES OF GROWTH RATES (BETA_HS) AND MEAN EARNINGS (WBAR)
	* Means for covariates/dummies/wage summaries, first non-missing major codes, max degree level,
	* and the within-person sd of every dev_* deviation series.
	collapse (mean) `covars_2yr' `dummies_2yr' which_maj_* firstq_hs lastq_hs `wagevars_2yr' `spells_2yr' wage_adj* ///
		(firstnm) maj_* sec_maj_* ///
		(max) deg_level* ///
		(sd) dev_*, by(ssnrep)

	* Shorten the moving-average deviation names (full and shortened sample) to the dev_ma_* pattern used below
	foreach window in 2 4 8 {
		rename dev_wage_ma_`window' dev_ma_`window'
	}
	foreach window in 2 4 8 {
		rename dev_wage_ma_`window'_short dev_ma_`window'_short
	}

	*RENAME DEVIATION TO STANDARD DEVIATION AND CREATE COEFFICIENT OF VARIATION
	* After the collapse each dev_* holds an sd; CV = that sd divided by the matching mean wage_adj_* variable.
	foreach predicted in wagehat_trend ma_4 ma_8 {
		rename dev_`predicted' sd_`predicted'
		label variable sd_`predicted' "sd(wage_adj - `predicted')"
		gen coeff_var_`predicted' = sd_`predicted'/wage_adj_`predicted'
		label variable coeff_var_`predicted'  "CV = sd(wage_adj - `predicted')/wage_adj"

		rename dev_`predicted'_short sd_`predicted'_short
		gen coeff_var_`predicted'_short = sd_`predicted'_short/wage_adj_`predicted'_short
		label variable coeff_var_`predicted'_short  "CV = sd(wage_adj - `predicted')/wage_adj - shortened sample"
	}

	* Attach the annual-wage CV; keep every person in memory (matched or not), drop using-only rows
	merge 1:1 ssnrep using "$data/coeff_var_annual.dta", keep(master match) nogenerate

	save "$data/collapsed_data_2yr.dta", replace

	 
log close 

/** END **/ 
	 
	 
